{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# python version: 3.8.5\n",
    "import pandas as pd\n",
    "import spacy\n",
    "from scispacy.abbreviation import AbbreviationDetector\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset\n",
    "df = pd.read_json(\"inf_processed_all.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load(\"en_core_sci_sm\")\n",
    "nlp.add_pipe(\"abbreviation_detector\")\n",
    "\n",
    "def get_abbreviation_map(text):\n",
    "    text=str(text)\n",
    "    doc = nlp(text)\n",
    "    abbr_dict = {}\n",
    "    for abrv in doc._.abbreviations:\n",
    "        abbr_dict[str(abrv)] = str(abrv._.long_form)\n",
    "    return abbr_dict\n",
    "\n",
    "def process_q_row(qa_list):\n",
    "    qa_list=str(qa_list)\n",
    "    abbr_map = get_abbreviation_map(qa_list)\n",
    "    print(abbr_map)\n",
    "\n",
    "    if abbr_map:\n",
    "        # for i in range(len(qa_list)):\n",
    "            for abbr, full in abbr_map.items():\n",
    "                pattern = r'\\b' + re.escape(abbr) + r'\\b'\n",
    "                qa_list = re.sub(pattern, full, qa_list) \n",
    "    return qa_list\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save abbreviation dictionary to dataframe\n",
    "df['AbbrDict'] = df['Q&A'].apply(get_abbreviation_map)\n",
    "# Save processed Q&A to dataframe\n",
    "df[\"new_Q&A\"] = df[\"Q&A\"].apply(process_q_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save data to csv&json file \n",
    "df.to_csv('abbr_complement.csv', index=False)\n",
    "df.to_json('abbr_complement.json', orient='records')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "comp9021",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
